    ---
    title: "Apply ALPACA to COAD subtype networks"
    output: html_notebook
    ---

    We start by importing the relevant packages, including netzoor

    ```{r}
    library(netZooR)
    library(tidyverse)

# Change to your organism
library(clusterProfiler)
library(purrr)
library(org.Hs.eg.db)  # Human gene annotation database
```
Use the ALPACA algorithm to infer the network for each COAD subtype. 
You can use the run_alpaca.r script to run ALPACA between two networks 
and you have three outputs: the membership table, the alpaca object and the list of top genes.

Now we will read the membership table for the CMS2 and CMS4 subtypes.

```{r}
outputDir <- "../results/alpaca-batch-coad-subtype-connnectivity-20250317/"

```

We read the membership table for the CMS2 and CMS4 subtypes.
```{r}
# Read back the membership table
membership_cms2_cms4_tibble <- read_csv("../data/processed/alpaca-batch-coad-subtype-20240510/membership_csm2_csm4.csv")
membership_cms2_cms4_tibble
```

And now we want to check the number of nodes in each module.
```{r}
count_total <- membership_cms2_cms4_tibble %>%
  count(module, name = "node_count")
  
# Count nodes ending in _A (tf)
count_A <- membership_cms2_cms4_tibble %>%
  filter(grepl("_A$", node)) %>%
  count(module, name = "count_tf")

# Count nodes ending in _B (genes)
count_B <- membership_cms2_cms4_tibble %>%
  filter(grepl("_B$", node)) %>%
  count(module, name = "count_gene")
  
# Join the two counts
count_A_B <- full_join(count_A, count_B, by = "module")

count_A_B <- full_join(count_A_B, count_total, by = "module")

# SAve the table
write_csv(count_A_B, file = paste(outputDir, 'cms2_cms4_nodes_per_cluster.csv', sep = ""))
count_A_B

```

```{r}
library(RColorBrewer)

```
We also plot the Number of nodes in each module

```{r}
# draw histogram of the percentage of nodes in each module
count_A_B %>%
dplyr::select(module, count_tf, count_gene) %>%
  gather(key = "type", value = "count", -module) %>%
  ggplot(aes(x = module, y = count, fill = type)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Module", y = "Number of nodes", fill = "Type")+
  # change colors to Set2
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "top")

  ggsave(file = paste(outputDir, 'cms2_cms4_nodes_per_cluster.pdf', sep = ""), width = 6, height = 4)
```

Check the percentage of nodes in each module. 
This shows how big compared to the total number of nodes in the network are the nodes in each module.\
We also normalize for the number of tf and genes, such that TFs and genes are comparable.

```{r}
# do the same but with percentage respective to the total number of A nodes and B nodes
# fill NA with 0
count_A_B[is.na(count_A_B)] <- 0
count_A_B$percentage_tf = count_A_B$count_tf / sum(count_A_B$count_tf) * 100

count_A_B$percentage_gene = count_A_B$count_gene / sum(count_A_B$count_gene) * 100

count_A_B %>%
    dplyr::select(module, percentage_tf, percentage_gene) %>%
  gather(key = "type", value = "percentage", -module) %>%
  ggplot(aes(x = factor(module), y = percentage, fill = type)) +
  # show all xlabel
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
  labs(x = "Module", y = "Number of nodes", fill = "Type")+
  # change colors to Set2
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "top")

  ggsave(file = paste(outputDir, 'cms2_cms4_percentage_nodes_per_cluster.pdf', sep = ""), width = 6, height = 4)

```

We also reformat the membership table to have the node name and the type in separate columns.
```{r}
# Split node column in the node_name and the type split by "_"
membership_cms2_cms4_tibble <- membership_cms2_cms4_tibble %>%
  separate(node, into = c("node_name", "type"), sep = "_", remove = FALSE)
membership_cms2_cms4_tibble
```

### We do pathway analysis on the genes in each module

```{r}
library(dplyr)
library(stringr)

```



```{r}
df_grouped_string <- membership_cms2_cms4_tibble %>%
    filter(grepl("_B$", node)) %>%
  mutate(node = str_remove(node, "_B")) %>%
  group_by(module) %>%
  summarise(nodes = list(node))
```

```{r}
tf_grouped_string <- membership_cms2_cms4_tibble %>%
    filter(grepl("_A$", node)) %>%
  mutate(node = str_remove(node, "_A")) %>%
  group_by(module) %>%
  summarise(nodes = list(node))
```

Here are the functions to run GO/KEGG/REACTOME analysis on the genes in each module

```{r}
# Function to perform enrichment analysis per module
perform_enrichment <- function(genes, module) {
  # Convert ENSEMBL to ENTREZ
  genes_entrez <- bitr(genes, fromType="ENSEMBL", toType="ENTREZID", OrgDb=org.Hs.eg.db)

  # If no valid ENTREZ IDs are found, return NULL
  if (nrow(genes_entrez) == 0) return(NULL)

  # Perform GO enrichment analysis
  ego <- enrichGO(
    gene          = genes_entrez$ENTREZID,
    OrgDb         = org.Hs.eg.db,
    keyType       = "ENTREZID",
    ont           = "BP",  # Biological Process
    pAdjustMethod = "BH",
    pvalueCutoff  = 1,
    qvalueCutoff  = 1
  )

  # Add module information and return results as a dataframe
  if (!is.null(ego)) {
    ego_df <- as.data.frame(ego)
    ego_df$module <- module  # Add module ID
    return(ego_df)
  } else {
    print('No valid ENTREZ IDs found for module')
    return(NULL)
  }
}
```

```{r}
# Function to perform enrichment analysis per module
perform_kegg_enrichment <- function(genes, module) {
  # Convert ENSEMBL to ENTREZ
  genes_entrez <- bitr(genes, fromType="ENSEMBL", toType="ENTREZID", OrgDb=org.Hs.eg.db)

  # If no valid ENTREZ IDs are found, return NULL
  if (nrow(genes_entrez) == 0) return(NULL)

  # Perform GO enrichment analysis
  ego <- enrichKEGG(
    gene          = genes_entrez$ENTREZID,
    organism = 'hsa',
    pAdjustMethod = "BH",
    pvalueCutoff  = 1,
    qvalueCutoff  = 1
  )

  # Add module information and return results as a dataframe
  if (!is.null(ego)) {
    ego_df <- as.data.frame(ego)
    ego_df$module <- module  # Add module ID
    return(ego_df)
  } else {
    print('No valid ENTREZ IDs found for module')
    return(NULL)
  }
}


```

```{r}
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("ReactomePA")
```
```{r}
library(ReactomePA)
# Function to perform enrichment analysis per module
perform_reactome_enrichment <- function(genes, module) {
  # Convert ENSEMBL to ENTREZ
  genes_entrez <- bitr(genes, fromType="ENSEMBL", toType="ENTREZID", OrgDb=org.Hs.eg.db)

  # If no valid ENTREZ IDs are found, return NULL
  if (nrow(genes_entrez) == 0) return(NULL)

    ego <- enrichPathway(gene=genes_entrez$ENTREZID, 
    pAdjustMethod = "BH",
    pvalueCutoff  = 1,
    qvalueCutoff  = 1
            , readable=TRUE)

  # Add module information and return results as a dataframe
  if (!is.null(ego)) {
    ego_df <- as.data.frame(ego)
    ego_df$module <- module  # Add module ID
    return(ego_df)
  } else {
    print('No valid ENTREZ IDs found for module')
    return(NULL)
  }
}
```


```{r}
# Iterated over the list of genes in each module and perform pathway analysis and concatenate the results table

# Iterate over each module and perform enrichment analysis
all_results <- map_dfr(df_grouped_string$module, function(module) {
  genes <- unlist(df_grouped_string$nodes[df_grouped_string$module == module])
  if (length(genes)>10){
  print(paste('module',module))
  perform_enrichment(genes, module)
  } else {
     return(NULL)
  }

})

# Print combined results
print(all_results)
all_results %>% write_tsv(file = paste(outputDir, 'cms2_cms4_pathway_go.tsv', sep = ""))
```

```{r}
all_result_significant <- all_results %>%
  filter(p.adjust < 0.05) %>%
  arrange(p.adjust)
print(all_result_significant)
```


```{r, fig.width=10, fig.height=6}
library(ggplot2)
library(dplyr)

# Select the top 10 pathways per module
top_results <- all_result_significant %>%
  group_by(module) %>%
  slice_min(pvalue, n = 10) %>%  # Get the top 10 pathways per module
  ungroup()

# Convert module to factor for better visualization
top_results$module <- as.factor(top_results$module)

options(repr.plot.width = 14, repr.plot.height = 6)  # Wider plot

# Create grouped bar plot
ggplot(top_results, aes(x = reorder(Description, -pvalue), y = -log10(pvalue), fill = module)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip for readability
  facet_wrap(~ module, scales = "free_y", ncol = 3) +  # Separate plots per module
  labs(
    title = "Top 10 Enriched Pathways per Module",
    x = "Pathway",
    y = "-log10(p-value)",
    fill = "Module"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 11),
    axis.title = element_text(size = 11),
    legend.position = "right"
  ) #+
  #scale_fill_brewer(palette = "Set3")  # Use categorical colors



# Save the plot as a wide image
ggsave(file = paste(outputDir, 'top10_gos_per_module.pdf', sep = ""), width = 18, height = 18)

```

### Reactome



```{r}
# Iterated over the list of genes in each module and perform pathway analysis and concatenate the results table

# Iterate over each module and perform enrichment analysis
reactome_results <- map_dfr(df_grouped_string$module, function(module) {
  genes <- unlist(df_grouped_string$nodes[df_grouped_string$module == module])
  if (length(genes)>10){
  print(paste('module',module))
  perform_reactome_enrichment(genes, module)
  } else {
     return(NULL)
  }

})

# Print combined results
print(reactome_results)
reactome_results %>% write_tsv(file = paste(outputDir, 'cms2_cms4_pathway_reactome.tsv', sep = ""))
```

```{r}
all_reactome_result_significant <- reactome_results %>%
  filter(p.adjust < 0.2) %>%
  arrange(p.adjust)
print(all_reactome_result_significant)
```


```{r, fig.width=12, fig.height=6}
library(ggplot2)
library(dplyr)

# Select the top 10 pathways per module
top_results <- all_reactome_result_significant %>%
  group_by(module) %>%
  slice_min(pvalue, n = 15) %>%  # Get the top 10 pathways per module
  ungroup()

# Convert module to factor for better visualization
top_results$module <- as.factor(top_results$module)

options(repr.plot.width = 14, repr.plot.height = 6)  # Wider plot

# Create grouped bar plot
ggplot(top_results, aes(x = reorder(Description, -pvalue), y = -log10(pvalue), fill = module)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip for readability
  facet_wrap(~ module, scales = "free_y", ncol = 3) +  # Separate plots per module
  labs(
    title = "Top 10 Enriched Pathways per Module",
    x = "Pathway",
    y = "-log10(p-value)",
    fill = "Module"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 11),
    axis.title = element_text(size =11),
    legend.position = "right"
  ) #+
  #scale_fill_brewer(palette = "Set3")  # Use categorical colors

# Save the plot as a wide image
ggsave(file = paste(outputDir, 'top10_reactomes_per_module.pdf', sep = ""), width = 30, height = 15, dpi = 300)
```


### KEGG

```{r}
# Iterate over each module and perform enrichment analysis
all_kegg_results <- map_dfr(df_grouped_string$module, function(module) {
  genes <- unlist(df_grouped_string$nodes[df_grouped_string$module == module])
  if (length(genes)>10){
  print(paste('module',module))
  perform_kegg_enrichment(genes, module)
  } else {
     return(NULL)
  }
})

# Print combined results
print(all_kegg_results)
all_kegg_results %>% write_tsv(file = paste(outputDir, 'cms2_cms4_pathway_kegg.tsv', sep = ""))
```


```{r}
all_kegg_result_significant <- all_kegg_results %>%
  filter(p.adjust < 0.2) %>%
  arrange(p.adjust)
print(all_kegg_result_significant)
```


```{r, fig.width=12, fig.height=6}
library(ggplot2)
library(dplyr)

# Select the top 10 pathways per module
top_results <- all_kegg_result_significant %>%
  group_by(module) %>%
  slice_min(pvalue, n = 15) %>%  # Get the top 10 pathways per module
  ungroup()

# Convert module to factor for better visualization
top_results$module <- as.factor(top_results$module)

options(repr.plot.width = 14, repr.plot.height = 6)  # Wider plot

# Create grouped bar plot
ggplot(top_results, aes(x = reorder(Description, -pvalue), y = -log10(pvalue), fill = module)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip for readability
  facet_wrap(~ module, scales = "free_y", ncol = 5) +  # Separate plots per module
  labs(
    title = "Top 10 Enriched Pathways per Module",
    x = "Pathway",
    y = "-log10(p-value)",
    fill = "Module"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 11),
    axis.title = element_text(size = 11),
    legend.position = "right"
  ) #+
  #scale_fill_brewer(palette = "Set3")  # Use categorical colors

# Save the plot as a wide image
ggsave(file = paste(outputDir, 'top10_keggs_per_module.pdf', sep = ""), width = 25, height = 10, dpi = 300)

```

```{r}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(pheatmap)

kegg = all_kegg_results

# Drop rows where subcategory is NA
kegg <- kegg %>%
  filter(!is.na(subcategory))

# Filter significant terms (p.adjust < 0.05) and count
ktemp <- kegg %>%
  filter(p.adjust < 0.05) %>%
  count(subcategory, module)

```

```{r}
library(readr)
library(dplyr)
library(tidyr)
# Pivot wider
ktemp_wide <- ktemp %>%
  pivot_wider(names_from = module, values_from = n, values_fill = 0)

# Set rownames and remove 'subcategory' column
ktemp_mat <- ktemp_wide %>%
  column_to_rownames('subcategory')


```


```{r}
library(viridis)


# Important: Make sure all columns are numeric (they are currently character!)
ktemp_mat <- ktemp_mat %>%
  mutate(across(everything(), as.numeric)) %>%
  as.matrix()
```


```{r}

# --- Masking zeros: make them NA ---
ktemp_mat_masked <- ktemp_mat
ktemp_mat_masked[ktemp_mat_masked == 0] <- NA

# --- Sort rows based on increasing order of first module (column "1") ---
# (if you want to sort by a specific module index)
module1 <- "1"  # choose the module you want to sort by
if (module1 %in% colnames(ktemp_mat_masked)) {
  ktemp_mat_masked <- ktemp_mat_masked[order(ktemp_mat_masked[, module1], decreasing = TRUE, na.last = TRUE), ]
}

# --- Plot heatmap ---
pheatmap(
  ktemp_mat_masked,
  color = viridis(100),  # White color for NA (masked)
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  angle_col = 45,       # Slight tilt to x labels
  border_color = "grey90",
  fontsize_row = 9,
  fontsize_col = 10,
  legend_breaks = c(1, 3, 5, 7, 10),
  main = "Significant KEGG Terms",
  cellwidth = 20,       # Force cells to be square
  cellheight = 20       # Force cells to be square
)

```


```{r}

# --- Pivot longer for ggplot ---
ktemp_long <- ktemp %>%
  mutate(module = as.factor(module))  # Make module categorical

# --- Sort subcategories based on NAME alphabetically ---
sorting_order <- ktemp_long %>%
  distinct(subcategory) %>%     # get unique subcategories
  arrange(subcategory) %>%      # sort alphabetically
  pull(subcategory)

# Then use it to reorder
ktemp_long <- ktemp_long %>%
  mutate(subcategory = factor(subcategory, levels = sorting_order))

ktemp_long <- ktemp_long %>%
  mutate(subcategory = factor(subcategory, levels = sorting_order))
# Create heatmap
p <- ggplot(ktemp_long, aes(x = module, y = subcategory, fill = n)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_viridis(option = "D", na.value = "white", name = "Significant terms") +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5),
    axis.text.y = element_text(hjust = 1),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    aspect.ratio = 2,   # <-- force squares
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 8) # smaller axis text for fitting
  ) +
  labs(title = "Significant KEGG Terms by Module")

# Show the plot
print(p)

# Save it, set exact square dimensions (e.g., 7x7 inches)
ggsave(
  filename = "kegg_heatmap_square.pdf",
  plot = p,
  width = 7,    # Adjusted to 7x7 for better aspect ratio
  height = 7,   # Ensuring it's square
  units = "in",
  dpi = 300
)
```
